Install and Import Required Packages
# Install (if missing) and attach every package used in this session.
suppressMessages({
  # CRAN packages required below. requireNamespace() checks availability
  # without attaching, so the library() calls at the end stay the single
  # place where packages are attached.
  cran_packages <- c(
    "knitr", "ggplot2", "plotly", "rpart", "rpart.plot", "randomForest",
    "gbm", "caret", "factoextra", "FactoMineR", "pROC", "ggdendro",
    "mlbench", "pheatmap"
  )
  for (pkg in cran_packages) {
    if (!requireNamespace(pkg, quietly = TRUE)) {
      install.packages(pkg)
    }
  }
  # GEOquery is a Bioconductor package and must be installed through
  # BiocManager. Note: the original code only installed GEOquery when
  # BiocManager itself was missing, so an existing BiocManager with a
  # missing GEOquery was never fixed; the two checks are now independent.
  if (!requireNamespace("BiocManager", quietly = TRUE)) {
    install.packages("BiocManager")
  }
  if (!requireNamespace("GEOquery", quietly = TRUE)) {
    BiocManager::install("GEOquery")
  }
  # Attach all packages (MASS ships with R, so it needs no install guard).
  library(knitr)
  library(ggplot2)
  library(plotly)
  library(MASS)
  library(rpart)
  library(rpart.plot)
  library(randomForest)
  library(gbm)
  library(caret)
  library(factoextra)
  library(FactoMineR)
  library(pROC)
  library(ggdendro)
  library(mlbench)
  library(BiocManager)
  library(GEOquery)
  library(pheatmap)
})
## Warning: package 'BiocManager' was built under R version 4.4.3
We will go through three supervised learning methods in this session.

**As a simple data set, we will use the “mtcars” data set in R. The following code can be used to display the data set.**
# Load the built-in Motor Trend data set and render its first ten rows
# as a formatted table.
library(knitr)
data("mtcars")
first_rows <- head(mtcars, n = 10)
kable(first_rows, caption = "Motor Trend Car Road Tests")
|  | mpg | cyl | disp | hp | drat | wt | qsec | vs | am | gear | carb |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Mazda RX4 | 21.0 | 6 | 160.0 | 110 | 3.90 | 2.620 | 16.46 | 0 | 1 | 4 | 4 |
| Mazda RX4 Wag | 21.0 | 6 | 160.0 | 110 | 3.90 | 2.875 | 17.02 | 0 | 1 | 4 | 4 |
| Datsun 710 | 22.8 | 4 | 108.0 | 93 | 3.85 | 2.320 | 18.61 | 1 | 1 | 4 | 1 |
| Hornet 4 Drive | 21.4 | 6 | 258.0 | 110 | 3.08 | 3.215 | 19.44 | 1 | 0 | 3 | 1 |
| Hornet Sportabout | 18.7 | 8 | 360.0 | 175 | 3.15 | 3.440 | 17.02 | 0 | 0 | 3 | 2 |
| Valiant | 18.1 | 6 | 225.0 | 105 | 2.76 | 3.460 | 20.22 | 1 | 0 | 3 | 1 |
| Duster 360 | 14.3 | 8 | 360.0 | 245 | 3.21 | 3.570 | 15.84 | 0 | 0 | 3 | 4 |
| Merc 240D | 24.4 | 4 | 146.7 | 62 | 3.69 | 3.190 | 20.00 | 1 | 0 | 4 | 2 |
| Merc 230 | 22.8 | 4 | 140.8 | 95 | 3.92 | 3.150 | 22.90 | 1 | 0 | 4 | 2 |
| Merc 280 | 19.2 | 6 | 167.6 | 123 | 3.92 | 3.440 | 18.30 | 1 | 0 | 4 | 4 |
**Let’s display the data set for Miles per Gallon
(response) and Weight (predictor).**
# Interactive scatter plot of fuel efficiency against vehicle weight.
mtcars_plot <- ggplot(mtcars, aes(x = wt, y = mpg)) +
  geom_point() +
  xlab('Weight (1000 lbs)') +
  ylab('Miles Per Gallon')
ggplotly(mtcars_plot)
The R function lm is used to fit a linear regression
in R. Here, Miles per Gallon is the response and
Weight is the predictor.
# Fit a simple linear regression of fuel efficiency (mpg) on vehicle
# weight (wt) and print the model summary.
simple_linear_regression <- lm(mpg ~ wt, data = mtcars)
summary(simple_linear_regression)
##
## Call:
## lm(formula = mpg ~ wt, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.5432 -2.3647 -0.1252 1.4096 6.8727
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 37.2851 1.8776 19.858 < 2e-16 ***
## wt -5.3445 0.5591 -9.559 1.29e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.046 on 30 degrees of freedom
## Multiple R-squared: 0.7528, Adjusted R-squared: 0.7446
## F-statistic: 91.38 on 1 and 30 DF, p-value: 1.294e-10
library(ggplot2)
library(plotly)
# Fitted values are computed once up front. The original called
# lm() inside aes() (refitting the model during plot evaluation) and used
# `$fitted`, which relies on partial matching of the `fitted.values`
# component; fitted() is the documented extractor.
mtcars_fitted <- fitted(lm(mpg ~ wt, data = mtcars))
linear_reg_gg <- ggplot(mtcars, aes(wt, mpg)) +
  geom_point() +
  geom_smooth(method = "lm", color = "red", se = FALSE) +
  # Dashed vertical segments show each point's residual to the fitted line.
  geom_segment(aes(xend = wt, yend = mtcars_fitted), linetype = "dashed") +
  xlim(0, 5.5) +
  xlab('Weight (1000 lbs)') + ylab('Miles Per Gallon')
ggplotly(linear_reg_gg)
## `geom_smooth()` using formula = 'y ~ x'
The following plots are used to check whether the assumptions of the linear regression are satisfied.
# Arrange the four regression diagnostic plots (residuals vs fitted,
# normal Q-Q, scale-location, Cook's distance) in a 2x2 grid.
# TRUE is spelled out; the original used the reassignable shorthand T.
layout(matrix(1:4, byrow = TRUE, ncol = 2))
plot(simple_linear_regression, which = 1:4)
Let’s add more input variables (or predictor variables) to the linear regression model.
\[ MPG = \beta_0 + \beta_1 Weight + \beta_2 ~ Cylinders + \beta_3 ~Rear\_axle\_Ratio + Error \]
# Multiple linear regression: add cylinder count (cyl) and rear-axle
# ratio (drat) as predictors alongside weight.
multiple_linear_regression <- lm(mpg ~ wt + cyl + drat, data = mtcars)
summary(multiple_linear_regression)
##
## Call:
## lm(formula = mpg ~ wt + cyl + drat, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.2944 -1.5576 -0.4667 1.5678 6.1014
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39.7677 6.8729 5.786 3.26e-06 ***
## wt -3.1947 0.8293 -3.852 0.000624 ***
## cyl -1.5096 0.4464 -3.382 0.002142 **
## drat -0.0162 1.3231 -0.012 0.990317
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.613 on 28 degrees of freedom
## Multiple R-squared: 0.8302, Adjusted R-squared: 0.812
## F-statistic: 45.64 on 3 and 28 DF, p-value: 6.569e-11
Since the Rear axle ratio is not significant (p-value
> 0.05), we can remove that variable and run the model again. \[ MPG = \beta_0 + \beta_1 Weight + \beta_2 ~
Cylinders + Error \]
# Refit the model after dropping the non-significant rear-axle ratio.
multiple_linear_regression <- lm(mpg ~ wt + cyl, data = mtcars)
summary(multiple_linear_regression)
##
## Call:
## lm(formula = mpg ~ wt + cyl, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.2893 -1.5512 -0.4684 1.5743 6.1004
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39.6863 1.7150 23.141 < 2e-16 ***
## wt -3.1910 0.7569 -4.216 0.000222 ***
## cyl -1.5078 0.4147 -3.636 0.001064 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.568 on 29 degrees of freedom
## Multiple R-squared: 0.8302, Adjusted R-squared: 0.8185
## F-statistic: 70.91 on 2 and 29 DF, p-value: 6.809e-12
Full Model
# Full model: regress mpg on every other column of mtcars.
full_model_linear_regression <- lm(mpg ~ ., data = mtcars)
summary(full_model_linear_regression)
##
## Call:
## lm(formula = mpg ~ ., data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4506 -1.6044 -0.1196 1.2193 4.6271
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.30337 18.71788 0.657 0.5181
## cyl -0.11144 1.04502 -0.107 0.9161
## disp 0.01334 0.01786 0.747 0.4635
## hp -0.02148 0.02177 -0.987 0.3350
## drat 0.78711 1.63537 0.481 0.6353
## wt -3.71530 1.89441 -1.961 0.0633 .
## qsec 0.82104 0.73084 1.123 0.2739
## vs 0.31776 2.10451 0.151 0.8814
## am 2.52023 2.05665 1.225 0.2340
## gear 0.65541 1.49326 0.439 0.6652
## carb -0.19942 0.82875 -0.241 0.8122
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.65 on 21 degrees of freedom
## Multiple R-squared: 0.869, Adjusted R-squared: 0.8066
## F-statistic: 13.93 on 10 and 21 DF, p-value: 3.793e-07
Backward Elimination
This method starts with the full model and eliminates unimportant predictor variables from the model.
# Backward elimination: start from the full model and drop one predictor
# at a time as long as the AIC improves.
library(MASS)
backward_model <- stepAIC(
  full_model_linear_regression,
  direction = "backward",
  trace = FALSE
)
summary(backward_model)
##
## Call:
## lm(formula = mpg ~ wt + qsec + am, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4811 -1.5555 -0.7257 1.4110 4.6610
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.6178 6.9596 1.382 0.177915
## wt -3.9165 0.7112 -5.507 6.95e-06 ***
## qsec 1.2259 0.2887 4.247 0.000216 ***
## am 2.9358 1.4109 2.081 0.046716 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.459 on 28 degrees of freedom
## Multiple R-squared: 0.8497, Adjusted R-squared: 0.8336
## F-statistic: 52.75 on 3 and 28 DF, p-value: 1.21e-11
Stepwise Method
This method starts with one variable and adds or removes predictor variables systematically.
# Stepwise selection: predictors may be both added and removed at each
# step, again guided by AIC.
library(MASS)
stepwise_model <- stepAIC(
  full_model_linear_regression,
  direction = "both",
  trace = FALSE
)
summary(stepwise_model)
##
## Call:
## lm(formula = mpg ~ wt + qsec + am, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4811 -1.5555 -0.7257 1.4110 4.6610
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.6178 6.9596 1.382 0.177915
## wt -3.9165 0.7112 -5.507 6.95e-06 ***
## qsec 1.2259 0.2887 4.247 0.000216 ***
## am 2.9358 1.4109 2.081 0.046716 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.459 on 28 degrees of freedom
## Multiple R-squared: 0.8497, Adjusted R-squared: 0.8336
## F-statistic: 52.75 on 3 and 28 DF, p-value: 1.21e-11
Heart Disease Data Set

In this section, the heart disease (heart_df) data set is
used to apply classification models.
# Build a synthetic heart-disease data set: 50 subjects with age,
# cholesterol, and a binary outcome (1 = heart disease, 0 = none).
# NOTE: the original code first created a 14-row heart_df (and three
# helper vectors) that were immediately overwritten below, plus an
# unused n_subjects constant; that dead code is removed here.
set.seed(123)
heart_df <- data.frame(
  age = c(
    56,63,59,48,62,54,71,67,44,53,
    58,65,49,72,61,47,55,69,52,64,
    45,73,60,51,68,57,46,70,66,43,
    74,50,62,58,47,69,54,63,55,71,
    49,67,60,52,42,56,64,45,70,53
  ),
  cholesterol = c(
    253,184,245,205,218,171,240,202,168,188,
    205,230,176,245,215,190,196,238,185,222,
    165,250,208,182,236,200,172,242,228,160,
    255,178,217,204,171,210,250,223,198,244,
    174,231,209,184,247,199,221,166,201,187
  ),
  heart_disease = c(
    1,1,1,0,1,0,1,1,0,0,
    1,1,0,1,1,0,1,1,0,1,
    0,1,1,0,1,0,0,1,1,0,
    1,0,1,1,0,1,0,1,0,1,
    0,1,1,0,1,1,1,0,0,0
  ))
heart_df
## age cholesterol heart_disease
## 1 56 253 1
## 2 63 184 1
## 3 59 245 1
## 4 48 205 0
## 5 62 218 1
## 6 54 171 0
## 7 71 240 1
## 8 67 202 1
## 9 44 168 0
## 10 53 188 0
## 11 58 205 1
## 12 65 230 1
## 13 49 176 0
## 14 72 245 1
## 15 61 215 1
## 16 47 190 0
## 17 55 196 1
## 18 69 238 1
## 19 52 185 0
## 20 64 222 1
## 21 45 165 0
## 22 73 250 1
## 23 60 208 1
## 24 51 182 0
## 25 68 236 1
## 26 57 200 0
## 27 46 172 0
## 28 70 242 1
## 29 66 228 1
## 30 43 160 0
## 31 74 255 1
## 32 50 178 0
## 33 62 217 1
## 34 58 204 1
## 35 47 171 0
## 36 69 210 1
## 37 54 250 0
## 38 63 223 1
## 39 55 198 0
## 40 71 244 1
## 41 49 174 0
## 42 67 231 1
## 43 60 209 1
## 44 52 184 0
## 45 42 247 1
## 46 56 199 1
## 47 64 221 1
## 48 45 166 0
## 49 70 201 0
## 50 53 187 0
# Scatter plot of age versus cholesterol, colored by heart-disease
# status. The x-axis label typo ('Cholestoral') is corrected.
heart_plot = ggplot(data = heart_df, mapping = aes(x = cholesterol, y = age, col = as.factor(heart_disease) )) +
  geom_point()+
  guides(color = guide_legend(title = "Heart Disease"))+
  xlab('Cholesterol') + ylab('Age in Years')
ggplotly(heart_plot)
library(ggplot2)
# Plot the binary outcome against cholesterol and overlay the fitted
# logistic (binomial GLM) curve.
heart_plot <- ggplot(heart_df, aes(x = cholesterol, y = heart_disease)) +
  geom_point() +
  geom_smooth(
    method = "glm",
    method.args = list(family = "binomial"),
    se = FALSE
  ) +
  xlab('Cholesterol') + ylab('Heart Disease')
ggplotly(heart_plot)
## `geom_smooth()` using formula = 'y ~ x'
\[ Logit\{Y\}=\beta_0+\beta_1~Cholesterol\] \[ Y= \frac{exp(\beta_0+\beta_1~Cholesterol)}{1+exp(\beta_0+\beta_1~Cholesterol)}\]
# Logistic regression of heart disease on cholesterol alone.
logistic_regression <- glm(
  heart_disease ~ cholesterol,
  data = heart_df,
  family = 'binomial'
)
summary(logistic_regression)
##
## Call:
## glm(formula = heart_disease ~ cholesterol, family = "binomial",
## data = heart_df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -19.82515 5.50299 -3.603 0.000315 ***
## cholesterol 0.09940 0.02745 3.621 0.000293 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 68.029 on 49 degrees of freedom
## Residual deviance: 34.196 on 48 degrees of freedom
## AIC: 38.196
##
## Number of Fisher Scoring iterations: 6
\[ Logit\{Y\}=\beta_0+\beta_1~Cholesterol + \beta_2 Age\] \[ Y= \frac{exp(\beta_0+\beta_1~Cholesterol+ \beta_2 Age)}{1+exp(\beta_0+\beta_1~Cholesterol+ \beta_2 Age)}\]
# Logistic regression with both cholesterol and age as predictors.
logistic_regression <- glm(
  heart_disease ~ cholesterol + age,
  data = heart_df,
  family = 'binomial'
)
summary(logistic_regression)
##
## Call:
## glm(formula = heart_disease ~ cholesterol + age, family = "binomial",
## data = heart_df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -25.27279 7.08238 -3.568 0.000359 ***
## cholesterol 0.06902 0.02652 2.603 0.009249 **
## age 0.20247 0.07907 2.561 0.010449 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 68.029 on 49 degrees of freedom
## Residual deviance: 26.894 on 47 degrees of freedom
## AIC: 32.894
##
## Number of Fisher Scoring iterations: 6
Get odds ratios from the parameter estimates:
# Exponentiate the log-odds coefficients to obtain odds ratios.
exp(coef(logistic_regression))
## (Intercept) cholesterol age
## 1.057226e-11 1.071458e+00 1.224426e+00
Predict for new set of data with Logistic Regression:
# Predicted probabilities of heart disease for two new subjects.
new_subjects <- data.frame(cholesterol = c(150, 190), age = c(55, 75))
predict(logistic_regression, newdata = new_subjects, type = "response")
## 1 2
## 0.02223167 0.95375721
Decision trees partition the data set using decision boundaries.
# Fit a shallow regression tree (depth at most 2) for mpg and draw it.
library(rpart)
library(rpart.plot)
ctrl <- list(cp = 0.00001, minbucket = 1, maxdepth = 2)
tree <- rpart(mpg ~ wt + cyl, data = mtcars, control = ctrl, method = 'anova')
rpart.plot(tree)
library(ggplot2)
# Visualize the tree's splits as decision boundaries: a horizontal line
# at wt = 2.3 and a vertical line at cyl = 7.
ggplot(mtcars, aes(x = cyl, y = wt, color = mpg)) +
  geom_point() +
  geom_hline(
    yintercept = 2.3, linetype = "dashed", color = "red", linewidth = 1
  ) +
  geom_vline(
    xintercept = 7, linetype = "dashed", color = "purple", linewidth = 1
  )
R code for Random Forest Regression
library(randomForest)
# Random forest regression of mpg on weight, cylinders, and axle ratio;
# importance = TRUE stores variable-importance measures for plotting.
Random_Forest_model <- randomForest(
  mpg ~ wt + cyl + drat,
  data = mtcars,
  ntree = 500,
  importance = TRUE
)
important_features <- as.data.frame(importance(Random_Forest_model))
important_features$feature_name <- row.names(important_features)
# Lollipop chart of %IncMSE (permutation importance); point size encodes
# the node-purity importance.
ggplot(important_features, aes(x = feature_name, y = `%IncMSE`)) +
  geom_segment(
    aes(x = feature_name, xend = feature_name, y = 0, yend = `%IncMSE`),
    color = "orange"
  ) +
  geom_point(aes(size = IncNodePurity), color = "darkorange", alpha = 0.9) +
  theme_minimal() +
  coord_flip() +
  theme( legend.position="None") +
  xlab('Features') + ylab('Importance')
library(randomForest)
# Random forest classifier for heart disease; converting the outcome to
# a factor makes randomForest perform classification.
Random_Forest_Classifier <- randomForest(
  as.factor(heart_disease) ~ cholesterol + age,
  data = heart_df,
  ntree = 500,
  importance = TRUE
)
important_features <- as.data.frame(importance(Random_Forest_Classifier))
important_features$feature_name <- row.names(important_features)
# Lollipop chart of mean decrease in Gini impurity; point size encodes
# mean decrease in accuracy.
ggplot(important_features, aes(x = feature_name, y = `MeanDecreaseGini`)) +
  geom_segment(
    aes(x = feature_name, xend = feature_name, y = 0, yend = `MeanDecreaseGini`),
    color = "orange"
  ) +
  geom_point(aes(size = MeanDecreaseAccuracy), color = "darkorange", alpha = 0.9) +
  theme_minimal() +
  coord_flip() +
  theme(legend.position = "none") +
  xlab('Features') + ylab('Importance')
Predict New Data with Random Forest:
# Class-probability predictions for two new subjects.
new_subjects <- data.frame(cholesterol = c(150, 190), age = c(55, 75))
predict(Random_Forest_Classifier, new_subjects, type = "prob")
## 0 1
## 1 0.726 0.274
## 2 0.388 0.612
## attr(,"class")
## [1] "matrix" "array" "votes"
A person aged 55 with a cholesterol level of 150 is predicted not to get heart disease.
A person aged 75 with a cholesterol level of 190 is predicted to get heart disease.
The probability of heart disease for a person aged 55 with a cholesterol level of 150 is \(\approx 82\%\).
The probability of heart disease for a person aged 75 with a cholesterol level of 190 is \(\approx 99\%\).
Boosting Method
library(gbm)
set.seed(123)
# Gradient-boosted regression of mpg. The distribution is specified
# explicitly; the original omitted it and relied on gbm's
# "assuming gaussian" guess (which also emitted a warning).
Gradient_Boost_Regression <- gbm(
  formula = mpg ~ wt + cyl + drat,
  distribution = "gaussian",
  data = mtcars,
  n.trees = 2000,
  n.minobsinnode = 5
)
## Distribution not specified, assuming gaussian ...
# Relative influence of each predictor in the boosted regression model.
summary(Gradient_Boost_Regression)
## var rel.inf
## wt wt 52.83806
## drat drat 32.11113
## cyl cyl 15.05080
set.seed(123)
# Gradient-boosted classifier for the 0/1 heart-disease outcome. The
# distribution is stated explicitly; the original omitted it and relied
# on gbm's "assuming bernoulli" guess (which also emitted a warning).
Gradient_Boost_Classifier <- gbm(
  heart_disease ~ cholesterol + age,
  distribution = "bernoulli",
  data = heart_df,
  n.trees = 2000
)
## Distribution not specified, assuming bernoulli ...
# Relative influence of each predictor in the boosted classifier.
summary(Gradient_Boost_Classifier)
## var rel.inf
## cholesterol cholesterol 50.28782
## age age 49.71218
# Predicted heart-disease probabilities from the boosted classifier for
# the same two new subjects.
test_data <- data.frame(cholesterol = c(150, 190), age = c(55, 75))
pred_gbm <- predict(
  object = Gradient_Boost_Classifier,
  newdata = test_data,
  n.trees = Gradient_Boost_Classifier$n.trees,
  type = "response"
)
pred_gbm
## [1] 0.8239065 0.9999572
Confusion matrix, Sensitivity and Specificity
library(caret)
# Compare the random forest's fitted class predictions with the observed
# labels; class '1' (disease) is treated as the positive class.
random_forest_prediction <- predict(Random_Forest_Classifier)
observed_data <- as.factor(heart_df$heart_disease)
confusionMatrix(random_forest_prediction, observed_data, positive = '1')
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 17 5
## 1 4 24
##
## Accuracy : 0.82
## 95% CI : (0.6856, 0.9142)
## No Information Rate : 0.58
## P-Value [Acc > NIR] : 0.0002834
##
## Kappa : 0.633
##
## Mcnemar's Test P-Value : 1.0000000
##
## Sensitivity : 0.8276
## Specificity : 0.8095
## Pos Pred Value : 0.8571
## Neg Pred Value : 0.7727
## Prevalence : 0.5800
## Detection Rate : 0.4800
## Detection Prevalence : 0.5600
## Balanced Accuracy : 0.8186
##
## 'Positive' Class : 1
##
ROC Curve
library(pROC)
# ROC curves (AUC printed on the plot) for the logistic regression and
# the random forest classifier.
roc_logistic <- roc(
  heart_df$heart_disease,
  as.numeric(predict(logistic_regression)),
  smoothed = TRUE, plot = TRUE, grid = TRUE, print.auc = TRUE
)
roc_random_forest <- roc(
  heart_df$heart_disease,
  as.numeric(random_forest_prediction),
  smoothed = TRUE, plot = TRUE, grid = TRUE, print.auc = TRUE
)
#plot(roc_logistic, main = "ROC Curve")
#lines(roc_random_forest,col='blue')
Let’s use the mtcars data set to apply a train-test split. We
split so that the training set has 70% of the data and the test set has
30%.
set.seed(1234)
# Flag each row for training with probability 0.7. The flag vector is
# named train_flag; the original called it `sample`, which shadowed the
# base function sample().
train_flag <- sample(c(TRUE, FALSE), nrow(mtcars), replace = TRUE, prob = c(0.7, 0.3))
mtcars_train <- mtcars[train_flag, ]
mtcars_test <- mtcars[!train_flag, ]
Train Data Set Size
# Rows and columns of the training split.
dim(mtcars_train)
## [1] 26 11
Test Data Set Size
# Rows and columns of the test split.
dim(mtcars_test)
## [1] 6 11
Apply k-fold cross validation for mtcars training data
set.
Cross Validation with Linear Regression
library(caret)
cross_validate_setup = trainControl(method = "cv", number = 4) # Cross validation with 4 folds
linear_regression_cv = train(mpg ~ .,
data = mtcars_train,
trControl = cross_validate_setup,
method = "lm",
metric = 'RMSE' # RMSE is used to compare the model
)
linear_regression_cv
## Linear Regression
##
## 26 samples
## 10 predictors
##
## No pre-processing
## Resampling: Cross-Validated (4 fold)
## Summary of sample sizes: 19, 19, 20, 20
## Resampling results:
##
## RMSE Rsquared MAE
## 3.765965 0.7585956 2.974284
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
Cross Validation with Random Forest Regression
library(caret)
# 4-fold cross-validation configuration.
cross_validate_setup <- trainControl(method = "cv", number = 4)
# Cross-validated random forest regression with a grid search over
# mtry = 1..10; models are compared on RMSE.
random_forest_regression_cv <- train(
  mpg ~ .,
  data = mtcars_train,
  trControl = cross_validate_setup,
  method = "rf",
  metric = 'RMSE',
  tuneGrid = expand.grid(.mtry = c(1 : 10)),
  na.action = na.pass
)
random_forest_regression_cv
## Random Forest
##
## 26 samples
## 10 predictors
##
## No pre-processing
## Resampling: Cross-Validated (4 fold)
## Summary of sample sizes: 18, 21, 20, 19
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 1 2.685053 0.8574958 2.111840
## 2 2.426804 0.8694192 1.875677
## 3 2.376424 0.8723288 1.751273
## 4 2.366303 0.8671525 1.705216
## 5 2.340886 0.8646725 1.654538
## 6 2.316080 0.8670054 1.647052
## 7 2.386482 0.8635178 1.702981
## 8 2.315735 0.8655307 1.658174
## 9 2.344355 0.8587729 1.674820
## 10 2.283882 0.8637430 1.647277
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 10.
Random Forest Regression with mtry = 10 is the best
model compared to the Linear Regression as the RMSE is the lowest.
Let’s apply K-Means Clustering to the mtcars data
set.
set.seed(123)
# Partition the cars into two k-means clusters and list the assignments.
k_means_clustering <- kmeans(mtcars, centers = 2)
print(k_means_clustering$cluster)
## Mazda RX4 Mazda RX4 Wag Datsun 710 Hornet 4 Drive
## 1 1 1 1
## Hornet Sportabout Valiant Duster 360 Merc 240D
## 2 1 2 1
## Merc 230 Merc 280 Merc 280C Merc 450SE
## 1 1 1 2
## Merc 450SL Merc 450SLC Cadillac Fleetwood Lincoln Continental
## 2 2 2 2
## Chrysler Imperial Fiat 128 Honda Civic Toyota Corolla
## 2 1 1 1
## Toyota Corona Dodge Challenger AMC Javelin Camaro Z28
## 1 2 2 2
## Pontiac Firebird Fiat X1-9 Porsche 914-2 Lotus Europa
## 2 1 1 1
## Ford Pantera L Ferrari Dino Maserati Bora Volvo 142E
## 2 1 2 1
Plot the K-Means Clustering using the factoextra
library.
library(factoextra)
# Visualize the k-means clusters in the first two principal components.
fviz_cluster(k_means_clustering, data = mtcars)
The following two methods can be used to determine the best number of clusters.
# Average silhouette width for each candidate number of clusters; the
# maximum suggests the best k.
fviz_nbclust(mtcars, kmeans, method = "silhouette")
## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## ℹ The deprecated feature was likely used in the ggpubr package.
## Please report the issue at <https://github.com/kassambara/ggpubr/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## ℹ The deprecated feature was likely used in the ggpubr package.
## Please report the issue at <https://github.com/kassambara/ggpubr/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## ℹ The deprecated feature was likely used in the ggpubr package.
## Please report the issue at <https://github.com/kassambara/ggpubr/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Total within-cluster sum of squares ("elbow" method) for each
# candidate number of clusters.
fviz_nbclust(mtcars, kmeans, method = "wss")
Let’s apply Hierarchical Clustering to the
mtcars data set.
library(ggdendro)
set.seed(123)
# Hierarchical clustering on pairwise Euclidean distances between cars,
# displayed as a dendrogram.
mtcars_distance <- dist(mtcars, method = 'euclidean')
h_clustering <- hclust(mtcars_distance)
ggdendrogram(h_clustering)
Hierarchical-clustering-based heat maps are popular in analyzing gene expression data.
Data Source: https://bioconductor.org/packages/devel/data/experiment/html/celldex.html
library(BiocManager)
library(GEOquery)
library(pheatmap)
# Download expression set GSE33126 from GEO (requires network access).
gse <- getGEO("GSE33126")
dim((exprs(gse[[1]])))
# Transpose so rows are samples and columns are probes.
gene_exp <- t(exprs(gse[[1]]))
# Keep only high-expression probes to keep the heat map manageable.
data_subset <- as.matrix(gene_exp[, colSums(gene_exp) > 500000])
# The distance matrix is named sample_dist; the original called it
# `dist`, shadowing the base function dist(). The unused dendrogram
# object (dhc) is also removed.
sample_dist <- dist(t(data_subset), diag = TRUE)
hc <- hclust(sample_dist)
#ggdendrogram(hc)
# Heat map of scaled expression values with hierarchical clustering on
# both rows and columns.
pheatmap(scale(data_subset))
Principal components reduce the data dimensions and allow us to visualize the correlation structure of the data effectively.
library(FactoMineR)
# PCA on mtcars; the biplot shows cars and variable loadings together.
pca_mtcars <- PCA(mtcars, graph = FALSE)
fviz_pca_biplot(pca_mtcars, repel = TRUE)